【爬虫】输出成excel表格—— xlwt 库的部分用法

（20190813更新）
发现一段更好更简洁的代码

import requests
import re
import json
import xlwt


def main(page):
    """Fetch one page of the Dangdang five-star ranking and save each parsed book.

    Args:
        page: Page number appended to the ranking URL.
    """
    url = 'http://bang.dangdang.com/books/fivestars/1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # The request failed or did not return HTTP 200; nothing to parse.
        return

    # parse_result() is a generator: it only does work when iterated, so the
    # results have to be consumed here for the page to have any effect.
    for item in parse_result(html):
        write_item_to_file(item)


def request_dandan(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The body text when the server answers with HTTP 200; ``None`` for any
        other status code or when ``requests.RequestException`` is raised.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_result(html):
    """Yield one dict per ``<li>`` entry matched in a ranking page.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing.

    Yields:
        dict: The seven regex captures, keyed as
            'range'     - digits following ``list_num``
            'image'     - ``src`` of the ``<img>`` tag
            'title'     - ``title`` attribute after ``class="name"``
            'recommend' - text of the ``class="tuijian"`` span
            'author'    - text of the first ``target="_blank"`` link after
                          ``class="publisher_info"``
            'times'     - text of the span inside ``class="biaosheng"``
            'price'     - text after the yen sign in ``class="price_n"``
    """
    if not html:
        return

    # Raw string so that \d and \s reach the regex engine unchanged.
    pattern = re.compile(r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>', re.S)
    for item in pattern.findall(html):
        yield {
            'range': item[0],
            'image': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6],
        }

def write_item_to_file(item):
    """Append ``item`` to ``book.txt`` as one JSON document per line.

    Args:
        item: JSON-serialisable object. Non-ASCII characters are written
            as-is (``ensure_ascii=False``) in UTF-8.
    """
    print('开始写入数据 ===> ' + str(item))
    # The with-block closes the file on exit, so no explicit close is needed.
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    # Create the workbook, fill in the header row and save it before crawling.
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('当当图书Top500')
    headings = ('排名', '连接', '书名', '推荐', '作者', '销量', '价格')
    for column, heading in enumerate(headings):
        sheet.write(0, column, heading)
    workbook.save('当当图书Top500.xls')

    # Crawl pages 1 through 25.
    for page_number in range(1, 26):
        main(page_number)

0x00 前言

我们用爬虫收集到信息后，如果数据太过杂乱，就不利于分析。假如能输出成表格的形式会更加直观。幸运的是，python 有 xlwt 这个写表格的库（相对应地，还有一个叫 xlrd 的读取表格的库），这就很方便了。

正巧这几天有个朋友遇到了相关的问题找我帮忙，我也趁此机会学习了一下。

0x01 使用格式

sheet.write($row, $col, $content, $Style)

注释：带’$’表示变量

示例

'''
在表格 0 行 3 列处（即 D1 单元格，行列均从 0 开始计数）
以宋体加粗的格式（前提已定义 set_style 函数）
写入字符串 'Hello'
'''
sheet.write(0, 3, 'Hello', set_style(u'宋体', 220, True))

0x02 一次性写入表格

这个是我最开始找到的资料，也是一开始使用的方法

python中使用xlrd、xlwt操作excel表格详解 – Sukie_csdn的博客 – CSDN博客

首先，先定义一个设置单元格样式的函数

'''
设置单元格样式
'''

def set_style(name, height, bold=False):
    """Build an xlwt cell style that customises only the font.

    Args:
        name: Font name, e.g. 'Times New Roman'.
        height: Value assigned to the font's ``height`` attribute.
        bold: Whether the font is bold.

    Returns:
        An ``xlwt.XFStyle`` carrying the configured font.
    """
    style = xlwt.XFStyle()  # start from a default style

    font = xlwt.Font()  # font to attach to the style
    font.name = name
    font.bold = bold
    # xlwt spells this attribute `colour_index`; assigning `color_index`
    # only creates an unused attribute and leaves the colour unchanged.
    font.colour_index = 4
    font.height = height

    style.font = font

    return style

接着创建工作簿和 sheet

def write_excel():
    f = xlwt.Workbook()  # create the workbook (the module is xlwt, not xlwr)

    # Create a sheet; more can be added the same way if needed.
    # It is bound to `sheet` because the write calls further down use that name.
    sheet = f.add_sheet('sheet1', cell_overwrite_ok=True)

写入第一行加粗作为分类的标题

    # First row: bold column headings.
    row0 = ['range', 'image', 'title', 'recommend', 'author', 'times', 'price']
    header_style = set_style(u'宋体', 220, True)
    for i, heading in enumerate(row0):
        sheet.write(0, i, heading, header_style)

不断循环往内对应单元格写入数据

    # Data rows start at index 1; index 0 holds the header row.
    # NOTE(review): `items` is not defined in the visible part of
    # write_excel() -- presumably the dicts produced by the parser; confirm
    # how it is supplied.
    # NOTE(review): xlwt targets the legacy .xls format, so the .xlsx
    # extension may not match the content -- confirm; the name is kept
    # because the later snippets reopen 'demo1.xlsx'.
    columns = ('range', 'image', 'title', 'recommend', 'author', 'times', 'price')
    cell_style = set_style(u'宋体', 220, False)
    for row, item in enumerate(items, start=1):
        print(row)
        # One column per field, in the same order as the header row.
        for col, key in enumerate(columns):
            sheet.write(row, col, item[key], cell_style)

    f.save('demo1.xlsx')

至此，我们可以完成一次性的写入并保存下来

0x03 往现有表格后添加数据

前面的方法虽然能基本上解决问题，但第二次写入的时候会把前面已经填写过的单元格清空，并不能得到我想要的效果。所以后面我又去找了一下能够往现有表格后添加数据的方法。

Python怎么向xls追加信息_百度知道

于是，我在中间添加了一段这个代码

# Make every write append to the existing data instead of overwriting it.
# Comment this section out to see the difference.
from xlrd import open_workbook
from xlutils.copy import copy

# rb is the previously saved workbook as read by xlrd; f is the copy returned
# by xlutils, and sheet is its first sheet, which the write calls then target.
rb = open_workbook('demo1.xlsx')
f = copy(rb)
sheet = f.get_sheet(0)

同时为了行数能累积下去，我把 row 这个变量也改成了全局变量

global row

以下是完整代码

#-*- coding:utf-8 -*-
import requests
import re
import json
import xlwt


def main(page):
    """Fetch one page of the Dangdang five-star ranking and append it to the sheet.

    Args:
        page: Page number appended to the ranking URL.
    """
    url = 'http://bang.dangdang.com/books/fivestars/1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # The request failed or did not return HTTP 200; passing None on
        # would raise TypeError inside re.findall once the items are consumed.
        return
    items = parse_result(html)
    write_item_to_excel(items)


def request_dandan(url):
    """Fetch ``url`` and return its body text, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``response.text`` when the status code is 200; ``None`` for any other
        status code or when ``requests.RequestException`` is raised.
    """
    try:
        # A timeout keeps one unresponsive request from blocking the crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text


def parse_result(html):
    """Yield one dict per ``<li>`` entry matched in a ranking page.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing.

    Yields:
        dict: The seven regex captures under the keys 'range', 'image',
        'title', 'recommend', 'author', 'times' and 'price', in the order
        the groups appear in the pattern.
    """
    if not html:
        return

    # Raw string so that \d and \s reach the regex engine unchanged.
    pattern = re.compile(r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>', re.S)
    keys = ('range', 'image', 'title', 'recommend', 'author', 'times', 'price')
    for groups in pattern.findall(html):
        # findall returns one tuple of seven captures per match.
        yield dict(zip(keys, groups))

def write_item_to_file(item):
    """Append ``item`` to ``book.txt`` as a single JSON line.

    Args:
        item: JSON-serialisable object. It is dumped with
            ``ensure_ascii=False`` and written in UTF-8.
    """
    print('开始写入数据 ===> ' + str(item))
    # Leaving the with-block closes the file; the former bare `f.close`
    # (no parentheses) never called anything.
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


def set_excel_style(name, height, bold=False):
    """Return an xlwt style whose font has the given name, height and weight.

    Args:
        name: Font name.
        height: Value assigned to the font's ``height`` attribute.
        bold: Whether the font is bold.

    Returns:
        An ``xlwt.XFStyle`` with the configured font; colour index is 0.
    """
    cell_font = xlwt.Font()
    cell_font.name = name
    cell_font.height = height
    cell_font.bold = bold
    cell_font.colour_index = 0

    cell_style = xlwt.XFStyle()
    cell_style.font = cell_font
    return cell_style


def write_item_to_excel(items):
    """Append ``items`` to the first sheet of demo1.xlsx, creating it on first use.

    Rows are written at the module-level counter ``row``, which is advanced by
    one per item so that consecutive calls keep appending below earlier data.

    Args:
        items: Iterable of dicts with the keys 'range', 'image', 'title',
            'recommend', 'author', 'times' and 'price'.
    """
    # Function-scope imports, as in the original snippet.
    import os
    from xlrd import open_workbook
    from xlutils.copy import copy

    # `row` records how far down the sheet earlier calls have written.
    global row

    # NOTE(review): xlwt targets the legacy .xls format, so the .xlsx
    # extension may not match the content -- confirm; the file name is kept
    # so existing output keeps working.
    path = 'demo1.xlsx'
    columns = ['range', 'image', 'title', 'recommend', 'author', 'times', 'price']

    if os.path.exists(path):
        # Reopen the saved workbook and copy it so new rows are appended
        # instead of replacing what is already there.
        f = copy(open_workbook(path))
        sheet = f.get_sheet(0)
    else:
        # First call: there is nothing to reopen yet, so start a new
        # workbook and write the bold header row.
        f = xlwt.Workbook()
        sheet = f.add_sheet('sheet1', cell_overwrite_ok=True)
        header_style = set_excel_style(u'宋体', 220, True)
        for col, heading in enumerate(columns):
            sheet.write(0, col, heading, header_style)

    body_style = set_excel_style(u'宋体', 220, False)
    for item in items:
        print(row)
        for col, key in enumerate(columns):
            sheet.write(row, col, item[key], body_style)
        row = row + 1

    # Save once per call rather than once per item.
    f.save(path)

if __name__ == '__main__':
    # Data rows start at sheet index 1 (index 0 is the header row).
    # write_item_to_excel() advances this module-level counter; no `global`
    # statement is needed here because this code already runs at module scope.
    row = 1
    for i in range(1, 26):
        main(i)

0x04 结尾

end.

本作品采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可。

【爬虫】输出成excel表格—— xlwt 库的部分用法

于2019年6月11日由Rbit发布

0x00 前言

0x01 使用格式

0x02 一次性写入表格

0x03 往现有表格后添加数据

0x04 结尾

发表评论取消回复

Python

【兔叽】Python 解决多余空格和格式转换问题（三）

Python

【兔叽】Python BeautifulSoup4（二）

Python

【兔叽】python 抓取网页源码（一）

0x00 前言

0x01 使用格式

0x02 一次性写入表格

0x03 往现有表格后添加数据

0x04 结尾

发表评论 取消回复

相关文章

Python

【兔叽】Python 解决多余空格和格式转换问题 （三）

Python

【兔叽】Python BeautifulSoup4（二）

Python

【兔叽】python 抓取网页源码（一）

发表评论取消回复

【兔叽】Python 解决多余空格和格式转换问题（三）