Installation
pip install scrapy
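To confirm the install worked, print the installed version:
scrapy version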
Create a project
scrapy startproject <project folder name>
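For example, creating a project with the package name used by the imports later in these notes (pachong):
scrapy startproject pachong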
Create a spider file
scrapy genspider taobao www.taobao.com
Run the spider; add -o to export the results to a file, otherwise they are only printed in the log
scrapy crawl <spider name> -o <output file>
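For example, running the taobao spider generated above and exporting to CSV (Scrapy picks the format from the file extension; the filename is just an example):
scrapy crawl taobao -o taobao.csv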
settings.py configuration
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"  # masquerade as a regular browser
CONCURRENT_REQUESTS = 32  # maximum number of concurrent requests
Define the item fields in the items file (items.py)
title = scrapy.Field()
price = scrapy.Field()
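A complete items.py sketch for the item used by the spider below: the class name PachongItem comes from the import in the spider, the rank and subject fields match what the spider assigns, and price is kept from the snippet above.
import scrapy

class PachongItem(scrapy.Item):
    title = scrapy.Field()    # title text
    price = scrapy.Field()
    rank = scrapy.Field()     # rating score
    subject = scrapy.Field()  # one-line quote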
Spider file example
Imports
import scrapy
from scrapy import Selector, Request  # Selector for CSS extraction, Request for follow-up fetches
from scrapy.http import HtmlResponse  # type of the response passed to parse
from pachong.items import PachongItem
The parse callback receives an HtmlResponse object
def parse(self, response: HtmlResponse):
Extracting the items
    sel = Selector(response)
    list_items = sel.css("#content > div > div.article > ol > li")
    for list_item in list_items:
        pachong_item = PachongItem()
        pachong_item['title'] = list_item.css("span.title::text").extract_first()
        pachong_item['rank'] = list_item.css("span.rating_num::text").extract_first()
        pachong_item['subject'] = list_item.css("span.inq::text").extract_first()
        yield pachong_item
Follow links to crawl deeper pages
    url_items = sel.css("div.paginator > a::attr(href)")
    for item in url_items:
        url = response.urljoin(item.extract())
        yield Request(url=url)
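Putting the fragments above together, a minimal complete spider could look like the sketch below. The CSS selectors match the Douban movie Top 250 list, so the start URL assumes that target; the spider name, allowed_domains and start_urls are illustrative and should be adapted to your own site.
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse
from pachong.items import PachongItem


class MovieSpider(scrapy.Spider):
    name = 'movie'                                    # run with: scrapy crawl movie
    allowed_domains = ['movie.douban.com']            # assumed target, see note above
    start_urls = ['https://movie.douban.com/top250']  # assumed start page

    def parse(self, response: HtmlResponse):
        sel = Selector(response)
        # one <li> per movie in the ranked list
        for list_item in sel.css("#content > div > div.article > ol > li"):
            pachong_item = PachongItem()
            pachong_item['title'] = list_item.css("span.title::text").extract_first()
            pachong_item['rank'] = list_item.css("span.rating_num::text").extract_first()
            pachong_item['subject'] = list_item.css("span.inq::text").extract_first()
            yield pachong_item
        # follow pagination links to crawl the remaining pages
        for href in sel.css("div.paginator > a::attr(href)"):
            yield Request(url=response.urljoin(href.extract()))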
Data-processing pipeline (pipelines.py)
import openpyxl

def __init__(self):  # constructor
    self.wb = openpyxl.Workbook()           # create the workbook
    self.ws = self.wb.active                # default worksheet
    self.ws.title = 'Top250'                # rename the worksheet
    self.ws.append(("标题", "评分", "主题"))  # write the header row
Hook methods
Runs once after crawling has finished
def close_spider(self, spider):
    self.wb.save("TOP250.xlsx")
Runs for each item during the crawl; item is the scraped data
def process_item(self, item, spider):
    # each scraped item passes through here; return it so later pipelines also receive it
    return item
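A complete pipelines.py sketch assembled from the pieces above; process_item is extended here to actually append each item as a worksheet row (the field names assume the item defined earlier), which the fragments above leave out.
import openpyxl


class PachongPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()           # create the workbook
        self.ws = self.wb.active                # default worksheet
        self.ws.title = 'Top250'                # rename the worksheet
        self.ws.append(("标题", "评分", "主题"))  # header row

    def process_item(self, item, spider):
        # one row per scraped item; missing fields fall back to ''
        self.ws.append((item.get('title', ''),
                        item.get('rank', ''),
                        item.get('subject', '')))
        return item

    def close_spider(self, spider):
        self.wb.save("TOP250.xlsx")             # write the file when the spider closes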
Enable the pipeline and middleware in settings.py
ITEM_PIPELINES = {
    "pachong.pipelines.PachongPipeline": 300,  # lower numbers run first
}
DOWNLOADER_MIDDLEWARES = {
    "pachong.middlewares.PachongDownloaderMiddleware": 543,  # downloader middleware
}
Intercepting requests with downloader middleware
page_source returns the rendered page, including data loaded dynamically by JS/AJAX
Returning a Response object hands it straight to the spider; the downloader will not fetch the page again
def process_response(self, request, response, spider):
    # Called with the response returned from the downloader.
    # Must either:
    # - return a Response object
    # - return a Request object
    # - or raise IgnoreRequest
    self.browser.get(request.url)  # re-render the page in a real browser
    return HtmlResponse(url=request.url, body=self.browser.page_source,
                        request=request, encoding='utf-8')
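self.browser is never defined in the fragment above; a common choice is a headless Selenium Chrome driver. A minimal sketch of the whole middleware under that assumption (re-rendering every response in the browser is illustrative; real code usually filters which requests actually need it):
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class PachongDownloaderMiddleware:
    def __init__(self):
        options = Options()
        options.add_argument('--headless')  # run Chrome without a window
        self.browser = webdriver.Chrome(options=options)

    def process_response(self, request, response, spider):
        # Re-render the page in the browser so JS/AJAX content is present,
        # then hand the rendered HTML straight to the spider.
        self.browser.get(request.url)
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8')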