The requirement: fetch the topic posts from the last 20 days of a tieba, together with their direct replies (filtering out replies-to-replies), and write the data to MySQL. Baidu Tieba's Shanghai bar is used as the example here.
Define the global settings (settings.py):
# -*- coding: utf-8 -*-

# Scrapy settings for tieba project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tieba'

SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'

START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7'
#START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6'

TOTAL_DAYS = "20"

ITEM_PIPELINES = ['tieba.pipelines.MySQLDBPipeline']

MySQL_SERVER = "localhost"
MySQL_SERVER_PORT = 3306
MySQL_SERVER_DB = "tieba"
MySQL_SERVER_USER = "mysql"
MySQL_SERVER_PWD = "xyz"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0'
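For reference, the kw parameter in START_URL is nothing more than the URL-encoded forum name, so switching to another bar (like the commented-out 上海交通大学 example) only needs a re-encode. A minimal sketch, assuming Python 2 to match the rest of the code:

# -*- coding: utf-8 -*-
# Build the START_URL for an arbitrary tieba; '%E4%B8%8A%E6%B5%B7' above is just the
# UTF-8, percent-encoded form of the forum name.
import urllib

def build_start_url(bar_name):
    # bar_name is a UTF-8 byte string, e.g. '上海' in a file declared as utf-8
    return 'http://tieba.baidu.com/f?ie=utf-8&kw=' + urllib.quote(bar_name)

print build_start_url('上海')  # -> http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7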
Data-scraping part (TiebaSpider.py — only the topic posts are handled for now; reply parsing is not ready yet):
#coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from tieba.items import SubjectItem
from tieba.items import CommentItem
from tieba import settings
import scrapy
import json


class TiebaSpider(CrawlSpider):
    name = 'tieba'
    # Note: promoted (ad) threads appear to live under a different domain,
    # so restricting to tieba.baidu.com already filters them out of the topic posts.
    allowed_domains = ['tieba.baidu.com']

    start_urls = [settings.START_URL]
    # Assume the number of topic posts in the last 20 days is < 1000 * 50. Adjust as needed,
    # or work out the exact page count from the timestamp shown for each topic post on the list page.
    for x in range(0, 1000):
        start_urls.append(settings.START_URL + "&pn=" + str((x + 1) * 50))

    # Only topic-post pages (/p/<id>) are parsed here
    rules = [Rule(LinkExtractor(allow=[r'/p/\d+']), 'parse_subject_shanghai')]

    def parse_subject_shanghai(self, response):
        try:
            torrent = SubjectItem()
            torrent['url'] = response.url
            torrent['id'] = response.url.split('/p/')[1].split('?')[0]
            torrent['commentNum'] = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[1]/text()").extract()[0]
            # Locating the content by id did not work; one likely reason is the custom <cc> tag
            torrent['content'] = response.xpath("//*/cc/div/text()").extract()[0]
            # A lot of information is missing from the HTML source and is rendered client-side with JS;
            # it is, however, available as JSON in the data-field attribute.
            dataField = json.loads(str(response.xpath("//*[@id='j_p_postlist']/div[1]/@data-field").extract()[0]))
            torrent['created'] = dataField['content']['date'].strip() + ":00"
            torrent['title'] = response.xpath("//*[@id='j_core_title_wrap']/div/h1/text()").extract()[0]
            torrent['tiebaName'] = response.xpath("//*[@id='container']/div/div[1]/div[2]/div[2]/a/text()").extract()[0].strip()
            torrent['authorName'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/text()").extract()[0]
            torrent['authorUrl'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/@href").extract()[0]
            torrent['authorAvatar'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[1]/div/a/img/@src").extract()[0]
            if not "http://tieba.baidu.com" in torrent['authorUrl']:
                torrent['authorUrl'] = "http://tieba.baidu.com" + torrent['authorUrl']

            hxs = HtmlXPathSelector(response)
            subject_post_div = hxs.select("//*/cc/div")[0]
            imgs = ['', '', '']
            index = 1
            for img in subject_post_div.select(".//img/@src"):
                if index > 3:
                    break
                imgs[index - 1] = img.extract()
                index += 1
            torrent['image1'], torrent['image2'], torrent['image3'] = imgs
            # Topic-post parsing is complete at this point.

            # Schedule the remaining reply pages (page 1 is the current response).
            totalCommentPage = int(response.xpath("//div[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()[0])
            for x in range(2, totalCommentPage + 1):
                url = torrent['url'] + ("?pn=%s" % x)
                yield scrapy.Request(url=url, callback=self.parse_comments_shanghai)
        except:
            # Mark the item as invalid; the pipeline skips items without an id
            torrent['id'] = None
        yield torrent

    def parse_comments_shanghai(self, response):
        try:
            items = []
            print response
            hxs = HtmlXPathSelector(response)
            print "---------------------------------------------------"
            j_p_postlist = hxs.select("//div[@id='j_p_postlist']").select(".//div[@class='l_post l_post_bright ']")
            print "----------------------------------------got it", j_p_postlist
            for childNode in j_p_postlist:
                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print childNode.extract()
        except:
            for item in items:
                item['id'] = None
        return items
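The parse_comments_shanghai callback above only prints the matched floor divs for now. Below is an untested sketch of how it might be completed, assuming every floor div under #j_p_postlist carries a data-field JSON attribute shaped like the one already used for the first floor; the keys 'author'/'user_name' and 'content'/'post_no' are assumptions that need to be verified against the live page (only 'content'/'date' is confirmed by the topic-post code above).

# Possible completion of the reply parsing (sketch only; XPaths and JSON keys are assumptions)
import json
from tieba.items import CommentItem

def parse_comments_shanghai(self, response):
    article_id = response.url.split('/p/')[1].split('?')[0]
    for post in response.xpath("//div[@id='j_p_postlist']/div[@data-field]"):
        try:
            data_field = json.loads(post.xpath("@data-field").extract()[0])
            # Skip floor 1, which is the topic post itself ('post_no' key is an assumption)
            if data_field['content'].get('post_no') == 1:
                continue
            item = CommentItem()
            item['article_id'] = article_id
            item['authorName'] = data_field['author']['user_name']   # assumed key
            item['index'] = data_field['content']['post_no']          # assumed key
            item['created'] = data_field['content']['date'].strip() + ":00"
            item['content'] = "".join(post.xpath(".//cc/div//text()").extract()).strip()
            yield item
        except (KeyError, ValueError, IndexError):
            continue

With the spider in place, the whole crawl is started with "scrapy crawl tieba" from the project directory.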
Data-persistence part (pipelines.py, topic posts only):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import settings
import MySQLdb
import MySQLdb.cursors
from scrapy import log
from twisted.enterprise import adbapi
from datetime import datetime


def strtodatetime(datestr, format):
    return datetime.strptime(datestr, format)


class MySQLDBPipeline(object):
    def __init__(self):
        self.date_time_format = "%Y-%m-%d %H:%M:%S"
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            host=settings.MySQL_SERVER,
                                            db=settings.MySQL_SERVER_DB,
                                            port=settings.MySQL_SERVER_PORT,
                                            user=settings.MySQL_SERVER_USER,
                                            passwd=settings.MySQL_SERVER_PWD,
                                            cp_reconnect=True,
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)

    def process_item(self, item, spider):
        # run the db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item).addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        if item.get('id') and item.get('created'):
            today = datetime.now()
            postDay = strtodatetime(item.get('created'), self.date_time_format)
            # Only posts from the last TOTAL_DAYS (20) days are written to the database
            if (today - postDay).days <= int(settings.TOTAL_DAYS):
                args = (item['id'], item['title'], item['url'], item['tiebaName'],
                        item['authorName'], item['authorUrl'], item['authorAvatar'],
                        item['content'], item['created'],
                        item['image1'], item['image2'], item['image3'],
                        item['commentNum'], item['commentNum'])
                sql = '''INSERT INTO tieba_articles(id, title, url, tiebaName, authorName, authorUrl,
                             authorAvatar, content, created, image1, image2, image3, commentNum)
                         VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                         ON DUPLICATE KEY UPDATE commentNum = %s'''
                # Parameterized query: let MySQLdb do the escaping, so quotes in the
                # post content do not break the statement
                tx.execute(sql, args)

    def handle_error(self, e):
        log.err(e)
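The pipeline assumes a tieba_articles table already exists, with id as the primary key (required for ON DUPLICATE KEY UPDATE to work). A one-off helper that creates it, reusing the same settings; the column types and lengths are guesses derived from the fields in the INSERT, so adjust them as needed:

# -*- coding: utf-8 -*-
# Creates the tieba_articles table the pipeline writes to (assumed schema).
import MySQLdb
from tieba import settings

DDL = """
CREATE TABLE IF NOT EXISTS tieba_articles (
    id           VARCHAR(32)  NOT NULL PRIMARY KEY,
    title        VARCHAR(255),
    url          VARCHAR(255),
    tiebaName    VARCHAR(64),
    authorName   VARCHAR(64),
    authorUrl    VARCHAR(255),
    authorAvatar VARCHAR(255),
    content      TEXT,
    created      DATETIME,
    image1       VARCHAR(255),
    image2       VARCHAR(255),
    image3       VARCHAR(255),
    commentNum   INT
) DEFAULT CHARSET=utf8
"""

if __name__ == '__main__':
    conn = MySQLdb.connect(host=settings.MySQL_SERVER,
                           port=settings.MySQL_SERVER_PORT,
                           user=settings.MySQL_SERVER_USER,
                           passwd=settings.MySQL_SERVER_PWD,
                           db=settings.MySQL_SERVER_DB,
                           charset='utf8')
    conn.cursor().execute(DDL)
    conn.commit()
    conn.close()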
Almost forgot — here is the data-structure part (items.py):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SubjectItem(scrapy.Item):
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    tiebaName = scrapy.Field()
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    commentNum = scrapy.Field()
    created = scrapy.Field()
    content = scrapy.Field()
    image1 = scrapy.Field()
    image2 = scrapy.Field()
    image3 = scrapy.Field()


class CommentItem(scrapy.Item):
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    content = scrapy.Field()
    index = scrapy.Field()
    article_id = scrapy.Field()
    created = scrapy.Field()
Summary: Scrapy defines a clear layered architecture, so the developer only needs to care about the business logic itself. For paginated data there are two patterns: 1) add all known URLs to a list (start_urls) up front; 2) yield scrapy.Request(...) from a callback to follow the next page. A rough sketch of both follows.
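The first pattern is exactly what the spider above does with start_urls; the second follows pagination lazily. The "next page" XPath in the second snippet is made up for illustration only and would need to match the real page.

# Pattern 1: the page URLs are known (or bounded) up front, so they all go into start_urls.
from tieba import settings

start_urls = [settings.START_URL]
for x in range(0, 1000):
    start_urls.append(settings.START_URL + "&pn=" + str((x + 1) * 50))

# Pattern 2: discover the next page while parsing and schedule it with a new Request.
import urlparse
import scrapy

def parse(self, response):
    # ... parse the items on this page ...
    next_page = response.xpath("//a[@class='next']/@href").extract()  # illustrative XPath
    if next_page:
        yield scrapy.Request(url=urlparse.urljoin(response.url, next_page[0]),
                             callback=self.parse)

Pattern 1 is simpler when an upper bound on the page count is acceptable; pattern 2 avoids requesting pages that do not exist and stops exactly when the site runs out of "next page" links.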